library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
numberFamilies <- 10000
famsLoc <- "../RObjects/family_dfs/individualDataFrames/no_censoring"

families <- list()
i=1
for(file in list.files(famsLoc)){
  load(str_interp("${famsLoc}/${file}"))
  families[[i]]=family
  i=i+1
  
}

#load in the summary table of PanelPro results
load(str_interp("../RObjects/summary_tables/summaryTable10000Families_no_censoring.Rdata"))

probandAge <- c()
for(i in 1:length(families)){
  proband = families[[i]] %>% filter(isProband==1) 
  probandAge <- c(probandAge, proband$CurAge)
}
summaryTable$probandAge <- probandAge
summaryTable<- summaryTable %>% filter(carrierRiskUnaffectedInfoMasked < 1)
summaryTable<- summaryTable %>% filter(fullCarrierRisk < 1)
print(nrow(summaryTable))
## [1] 9646
logit <- function(x){
  log(x)-log(1-x)
}

#offset <- 0.00001
summaryTable$logitFullCarrierRisk <- logit(summaryTable$fullCarrierRisk)
summaryTable$logitCarrierRiskUnaffectedInfoMasked <- logit(summaryTable$carrierRiskUnaffectedInfoMasked)
#logit spreads out those lower values and higher values
summaryTable$numAffectedRelativesIndicator0 <- ifelse(summaryTable$numAffectedRelatives == 0, 1, 0)
summaryTable$numAffectedRelativesIndicator1 <- ifelse(summaryTable$numAffectedRelatives == 1, 1, 0)
summaryTable$numAffectedRelativesIndicator2 <- ifelse(summaryTable$numAffectedRelatives == 2, 1, 0) 
summaryTable$numAffectedRelativesIndicator3 <- ifelse(summaryTable$numAffectedRelatives >= 3, 1, 0) 

summaryTable$group <- ifelse(summaryTable$numAffectedRelativesIndicator0 == 1, "0",
                                  ifelse(summaryTable$numAffectedRelativesIndicator1 == 1, "1",
                                  ifelse(summaryTable$numAffectedRelativesIndicator2 == 1, "2",
                                  ifelse(summaryTable$numAffectedRelativesIndicator3 == 1, "3+","NA"))))

#checking for correlations that may induce collinearity
cor_df <- summaryTable[, c("logitFullCarrierRisk", "logitCarrierRiskUnaffectedInfoMasked", "numAffectedRelativesIndicator1", "numAffectedRelativesIndicator2","numAffectedRelativesIndicator3")] 
print(cor(cor_df))
##                                      logitFullCarrierRisk
## logitFullCarrierRisk                            1.0000000
## logitCarrierRiskUnaffectedInfoMasked            0.8454426
## numAffectedRelativesIndicator1                 -0.2066505
## numAffectedRelativesIndicator2                 -0.2272025
## numAffectedRelativesIndicator3                  0.3527884
##                                      logitCarrierRiskUnaffectedInfoMasked
## logitFullCarrierRisk                                            0.8454426
## logitCarrierRiskUnaffectedInfoMasked                            1.0000000
## numAffectedRelativesIndicator1                                 -0.2291213
## numAffectedRelativesIndicator2                                 -0.2065293
## numAffectedRelativesIndicator3                                  0.3671385
##                                      numAffectedRelativesIndicator1
## logitFullCarrierRisk                                     -0.2066505
## logitCarrierRiskUnaffectedInfoMasked                     -0.2291213
## numAffectedRelativesIndicator1                            1.0000000
## numAffectedRelativesIndicator2                           -0.0827825
## numAffectedRelativesIndicator3                           -0.5392998
##                                      numAffectedRelativesIndicator2
## logitFullCarrierRisk                                     -0.2272025
## logitCarrierRiskUnaffectedInfoMasked                     -0.2065293
## numAffectedRelativesIndicator1                           -0.0827825
## numAffectedRelativesIndicator2                            1.0000000
## numAffectedRelativesIndicator3                           -0.7108671
##                                      numAffectedRelativesIndicator3
## logitFullCarrierRisk                                      0.3527884
## logitCarrierRiskUnaffectedInfoMasked                      0.3671385
## numAffectedRelativesIndicator1                           -0.5392998
## numAffectedRelativesIndicator2                           -0.7108671
## numAffectedRelativesIndicator3                            1.0000000

Families with First Degree Affected Relatives

summaryTableFirstDegAff <- summaryTable %>% filter(firstDegreeAffectedFamilyMembersBinary == 1)
nrow(summaryTableFirstDegAff)
## [1] 3759

Using Indicator Variables

#take out the 0 indicator because we are restricting to families with >= 1 affected relative

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked  + numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2, data=summaryTableFirstDegAff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2, 
##     data = summaryTableFirstDegAff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4137  -2.3804  -0.3293   3.1184   7.1742 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -4.32378    0.05324 -81.216  < 2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  2.38396    0.02763  86.278  < 2e-16 ***
## numAffectedRelativesIndicator1       -1.42154    0.37640  -3.777 0.000161 ***
## numAffectedRelativesIndicator2       -2.24691    0.23491  -9.565  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.01 on 3755 degrees of freedom
## Multiple R-squared:  0.694,  Adjusted R-squared:  0.6937 
## F-statistic:  2838 on 3 and 3755 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point() +
  geom_line(aes(y = predict(fitLinear), color = as.factor(group))) +
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Number of Affected Relatives") 

res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)

# get predicted values for training data
expit <- function(x){
  exp(x) / (1 + exp(x))
}

summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0971821656232819"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0742977280350966"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"

Without Using Indicator Variables

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTableFirstDegAff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegAff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5343  -2.3744  -0.5196   3.1562   7.4438 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -4.40967    0.05323  -82.84   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  2.45110    0.02707   90.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.049 on 3757 degrees of freedom
## Multiple R-squared:  0.6857, Adjusted R-squared:  0.6856 
## F-statistic:  8196 on 1 and 3757 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Proband MLH1")

res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)

# make a qq plot to assess normality

qqnorm(res)
qqline(res) 

# get predicted values for training data
summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0973747492407199"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.074229639381619"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"

Testing Covariates: Number of Affected Relatives, Age of Proband

Covariate: Number of Affected Relatives

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelatives , data=summaryTableFirstDegAff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     numAffectedRelatives, data = summaryTableFirstDegAff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3863  -1.5190  -0.0745   1.4875   6.5016 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.607774   0.086106  -99.97   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.678548   0.024586   68.27   <2e-16 ***
## numAffectedRelatives                  0.360479   0.006563   54.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.271 on 3756 degrees of freedom
## Multiple R-squared:  0.8257, Adjusted R-squared:  0.8256 
## F-statistic:  8896 on 2 and 3756 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color = as.factor(numAffectedRelatives))) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", subtitle="First Degree Family Members Affected, Covariate: Number of Affected Relatives", color = "Number of Affected Relatives")

# get predicted values for training data
summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.128149495702171"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0622038078710044"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"

Covariate: Proband Age

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge , data=summaryTableFirstDegAff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     probandAge, data = summaryTableFirstDegAff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5343  -2.3744  -0.5196   3.1562   7.4438 
## 
## Coefficients: (1 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -4.40967    0.05323  -82.84   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  2.45110    0.02707   90.53   <2e-16 ***
## probandAge                                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.049 on 3757 degrees of freedom
## Multiple R-squared:  0.6857, Adjusted R-squared:  0.6856 
## F-statistic:  8196 on 1 and 3757 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color =probandAge)) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Affected, Covariate: Proband Age", color = "Proband Age")

Covariates: Number of Affected Relatives and Proband Age

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge + numAffectedRelatives , data=summaryTableFirstDegAff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     probandAge + numAffectedRelatives, data = summaryTableFirstDegAff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3863  -1.5190  -0.0745   1.4875   6.5016 
## 
## Coefficients: (1 not defined because of singularities)
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.607774   0.086106  -99.97   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.678548   0.024586   68.27   <2e-16 ***
## probandAge                                  NA         NA      NA       NA    
## numAffectedRelatives                  0.360479   0.006563   54.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.271 on 3756 degrees of freedom
## Multiple R-squared:  0.8257, Adjusted R-squared:  0.8256 
## F-statistic:  8896 on 2 and 3756 DF,  p-value: < 2.2e-16

Proband age again seems statistically insignificant

ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point() +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Affected, Covariates: Proband Age and Number of Affected Relatives", color = "Proband Age")

Families with No Affected First Degree Relatives

summaryTableFirstDegUnaff <- summaryTable %>% filter(firstDegreeAffectedFamilyMembersBinary == 0)
nrow(summaryTableFirstDegUnaff)
## [1] 5887

Using Indicator Variables

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked  + numAffectedRelativesIndicator0 + numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2, data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     numAffectedRelativesIndicator0 + numAffectedRelativesIndicator1 + 
##     numAffectedRelativesIndicator2, data = summaryTableFirstDegUnaff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0761 -1.0164 -0.0547  0.9454  7.2737 
## 
## Coefficients:
##                                      Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                          -7.20780    0.04644 -155.208  < 2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.59751    0.02027   78.822  < 2e-16 ***
## numAffectedRelativesIndicator0       -0.19762    0.12099   -1.633    0.102    
## numAffectedRelativesIndicator1       -0.58210    0.07831   -7.433 1.21e-13 ***
## numAffectedRelativesIndicator2       -0.62772    0.06377   -9.844  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.584 on 5882 degrees of freedom
## Multiple R-squared:  0.5881, Adjusted R-squared:  0.5879 
## F-statistic:  2100 on 4 and 5882 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point() +
  geom_line(aes(y = predict(fitLinear), color = as.factor(group))) +
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Number of Affected Relatives") 

res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)

# get predicted values for training data
expit <- function(x){
  exp(x) / (1 + exp(x))
}

summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear))

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.00055016849877575"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00150412237671034"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"

Without Using Indicator Variables

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegUnaff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0394 -1.0562 -0.1008  0.9294  7.2865 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -7.17585    0.04630 -154.99   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.67414    0.01861   89.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.601 on 5885 degrees of freedom
## Multiple R-squared:  0.5791, Adjusted R-squared:  0.579 
## F-statistic:  8096 on 1 and 5885 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color=probandMLH1Status)) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information Linear Regression", subtitle="First Degree Family Members Unaffected", color = "Proband MLH1")

res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)

# make a qq plot to assess normality

qqnorm(res)
qqline(res) 

# get predicted values for training data

summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.000662526614050893"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00148619474565262"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"

Testing Covariates: Number of Affected Relatives, Age of Proband

Covariate: Number of Affected Relatives

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelatives , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     numAffectedRelatives, data = summaryTableFirstDegUnaff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2152 -0.9842  0.0017  0.9602  5.6139 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.625417   0.069115 -124.80   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.441611   0.019539   73.78   <2e-16 ***
## numAffectedRelatives                  0.196898   0.007277   27.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.51 on 5884 degrees of freedom
## Multiple R-squared:  0.6257, Adjusted R-squared:  0.6255 
## F-statistic:  4917 on 2 and 5884 DF,  p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color = as.factor(numAffectedRelatives))) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", subtitle="First Degree Family Members Unaffected, Covariate: Number of Affected Relatives", color = "Number of Affected Relatives")

### Covariate: Proband Age

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     probandAge, data = summaryTableFirstDegUnaff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0394 -1.0562 -0.1008  0.9294  7.2865 
## 
## Coefficients: (1 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -7.17585    0.04630 -154.99   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.67414    0.01861   89.98   <2e-16 ***
## probandAge                                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.601 on 5885 degrees of freedom
## Multiple R-squared:  0.5791, Adjusted R-squared:  0.579 
## F-statistic:  8096 on 1 and 5885 DF,  p-value: < 2.2e-16

Here the proband age seems more statistically signficant for families that do not have an affected first degree relative

ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color =probandAge)) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Unaffected, Covariate: Proband Age", color = "Proband Age")

Covariates: Number of Affected Relatives and Proband Age

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge + numAffectedRelatives , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + 
##     probandAge + numAffectedRelatives, data = summaryTableFirstDegUnaff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2152 -0.9842  0.0017  0.9602  5.6139 
## 
## Coefficients: (1 not defined because of singularities)
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.625417   0.069115 -124.80   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  1.441611   0.019539   73.78   <2e-16 ***
## probandAge                                  NA         NA      NA       NA    
## numAffectedRelatives                  0.196898   0.007277   27.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.51 on 5884 degrees of freedom
## Multiple R-squared:  0.6257, Adjusted R-squared:  0.6255 
## F-statistic:  4917 on 2 and 5884 DF,  p-value: < 2.2e-16

Proband age seems statistically significant here as well

ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point() +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Unaffected, Covariates: Proband Age and Number of Affected Relatives", color = "Proband Age")

# get predicted values for training data

summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.000751750339751532"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00124419275115061"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"

All families

First we will run a linear regression model on all families generated.

fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTable)
print(summary(fitLinear))
## 
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked, 
##     data = summaryTable)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5057 -1.5978 -0.2705  1.3052  7.9860 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -5.18092    0.03501  -148.0   <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked  2.36741    0.01523   155.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.427 on 9644 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7147 
## F-statistic: 2.417e+04 on 1 and 9644 DF,  p-value: < 2.2e-16
residuals <- residuals(fitLinear)
qqnorm(residuals)
qqline(residuals)

#the residuals should fit to the line in the qqplot if the data is normally distributed

range(summaryTable$fullCarrierRisk)
## [1] 7.020627e-08 9.999384e-01
range(summaryTable$carrierRiskUnaffectedInfoMasked)
## [1] 0.003292563 0.999862461
range(summaryTable$logitFullCarrierRisk)
## [1] -16.471828   9.695422
range(summaryTable$logitCarrierRiskUnaffectedInfoMasked)
## [1] -5.712791  8.891468
ggplot(data = summaryTable, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
  geom_point(aes(color = as.factor(firstDegreeAffectedFamilyMembersBinary))) +
  geom_line(aes(y = predict(fitLinear)))+
  labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "1st Deg Aff")

# get predicted values for training data

summaryTable$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0336185884164781"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0362840655982899"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"

Other models

Loess Regression

All Families

fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTable, span = 0.2)
summary(fitLoess) #RSE is 0.00397 (seems good but I'm not sure)
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTable, span = 0.2)
## 
## Number of Observations: 9646 
## Equivalent Number of Parameters: 15.72 
## Residual Standard Error: 0.09717 
## Trace of smoother matrix: 17.38  (exact)
## 
## Control settings:
##   span     :  0.2 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
loessVis = ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitLoess)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression")

print(loessVis)

# get predicted values for training data

summaryTable$predicted <- predict(fitLoess) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0617798157971618"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0245539261430636"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"

First Degree Affected Families

fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegAff, span = 0.2)
summary(fitLoess) #RSE is 0.00397 (seems good but I'm not sure)
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegAff, span = 0.2)
## 
## Number of Observations: 3759 
## Equivalent Number of Parameters: 15.49 
## Residual Standard Error: 0.1202 
## Trace of smoother matrix: 17.13  (exact)
## 
## Control settings:
##   span     :  0.2 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
loessVis = ggplot(data = summaryTableFirstDegAff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitLoess)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")

print(loessVis)

# get predicted values for training data
summaryTableFirstDegAff$predicted <- predict(fitLoess)

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.162655605163222"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0515684041885317"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"

First Degree Unaffected Families

fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegUnaff, span = 0.2)
summary(fitLoess) 
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegUnaff, span = 0.2)
## 
## Number of Observations: 5887 
## Equivalent Number of Parameters: 16.43 
## Residual Standard Error: 0.02714 
## Trace of smoother matrix: 18.16  (exact)
## 
## Control settings:
##   span     :  0.2 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
loessVis = ggplot(data = summaryTableFirstDegUnaff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitLoess)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")

print(loessVis)

# get predicted values for training data

summaryTableFirstDegUnaff$predicted <- predict(fitLoess)

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.00224151612905406"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00148496323184557"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"

Median Polish

All Families

fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTable)
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
print(summary(fitMedianPolished))
## 
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTable)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.169798 -0.015675  0.002148  0.013571  0.820718 
## 
## Coefficients:
##                                 Value    Std. Error t value 
## (Intercept)                      -0.0211   0.0003   -62.2679
## carrierRiskUnaffectedInfoMasked   0.2004   0.0011   185.2869
## 
## Residual standard error: 0.02109 on 9644 degrees of freedom
ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitMedianPolished)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression")

# get predicted values for training data

summaryTable$predicted <- predict(fitMedianPolished) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0223329738900075"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0500296559514577"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"

First Degree Affected Families

fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegAff)
print(summary(fitMedianPolished))
## 
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegAff)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83388 -0.07492 -0.01929  0.10017  0.23967 
## 
## Coefficients:
##                                 Value    Std. Error t value 
## (Intercept)                      -0.1876   0.0033   -57.0199
## carrierRiskUnaffectedInfoMasked   1.0692   0.0075   141.9900
## 
## Residual standard error: 0.1227 on 3757 degrees of freedom
ggplot(data = summaryTableFirstDegAff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitMedianPolished)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")

# get predicted values for training data
summaryTableFirstDegAff$predicted <- predict(fitMedianPolished) #need to do the expit to undo the logit transformation

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.176700240328606"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0629664172225647"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"

First Degree Unaffected Families

fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegUnaff)
print(summary(fitMedianPolished))
## 
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, 
##     data = summaryTableFirstDegUnaff)
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.211e-04 -4.547e-05  5.476e-06  3.295e-05  9.926e-01 
## 
## Coefficients:
##                                 Value    Std. Error t value 
## (Intercept)                      -0.0001   0.0000   -36.5957
## carrierRiskUnaffectedInfoMasked   0.0011   0.0000   144.7512
## 
## Residual standard error: 5.707e-05 on 5885 degrees of freedom
ggplot(data = summaryTableFirstDegUnaff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point(aes(color = probandMLH1Status)) +
  geom_line(aes(y = predict(fitMedianPolished)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Median Polished Regression", subtitle="First Degree Family Members Unaffected")

# get predicted values for training data

summaryTableFirstDegUnaff$predicted <- predict(fitMedianPolished) 

# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 9.21074572200889e-05"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00169588917621271"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"

Ad Hoc Approach

summaryTable$lowScoreIndicator <- ifelse(summaryTable$fullCarrierRisk < 0.1, 1, 0)
 

summaryTable$groupLowScore <- ifelse(summaryTable$lowScoreIndicator == 1, "0",
                                  ifelse(summaryTable$lowScoreIndicator == 1, "1","NA"))

fit <- lm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked + lowScoreIndicator, data= summaryTable)
print(summary(fit))
## 
## Call:
## lm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked + 
##     lowScoreIndicator, data = summaryTable)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62449 -0.01339  0.00924  0.02301  0.25211 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.514848   0.005499   93.62   <2e-16 ***
## carrierRiskUnaffectedInfoMasked  0.233011   0.005915   39.39   <2e-16 ***
## lowScoreIndicator               -0.548084   0.004811 -113.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08787 on 9643 degrees of freedom
## Multiple R-squared:  0.8314, Adjusted R-squared:  0.8314 
## F-statistic: 2.378e+04 on 2 and 9643 DF,  p-value: < 2.2e-16
ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
  geom_point() +
  geom_line(aes(y = predict(fit)))+
  labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "MLH1 Carrier Risk for Masked Info and Full Family Information", subtitle="First Degree Family Members Unaffected")

# get predicted values for training data

summaryTable$predicted <- predict(fit)

# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0637092689623149"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0222027186254911"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"